Version 2023.11.21_11.24.15 - focuses on genes found only relative to TCGA, not relative to any other cohort. Previous versions focused on outliers detected relative to TCGA and not treehouse, irrespective of whatever other cohorts they were outliers in
Version 2023.11.29_09.19.33 - focuses on all druggable genes, not only genes that were outliers vs one cohort or another
Version 2023.11.30_16.40.01 - incorporates pan disease cohorts
outliers <- read_tsv("../input_data/druggable_outliers_from_treehouse_and_other_cohorts_2023_11_09-13_46_32_2023.tsv") %>%
mutate(high_level_cohort = ifelse(str_detect(comparison_cohort, "Treehouse"),
"Treehouse",
comparison_cohort))
## Rows: 287 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sample_ID, comparison_cohort, gene, donor_ID
## lgl (1): pathway_support
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
outlier_genes_detected <- unique(outliers$gene)
v11_expr <- read_tsv("../input_data/druggable_TumorCompendium_v11_PolyA_hugo_log2tpm_58581genes_2020-04-09.tsv.gz") %>%
rename(Sample_ID = TH_id) %>%
mutate(ever_outlier_in_ckcc2 = Gene %in% outlier_genes_detected)
## Rows: 1414917 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, TH_id
## dbl (1): log2TPM1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stanford_samples <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/TH03_TH34_rollup.sample_list.txt",
col_names = "Sample_ID") %>%
mutate(cohort = "TH03_TH34")
## Rows: 110 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
TCGA_samples <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/TCGA_rollup.sample_list.txt",
col_names = "Sample_ID") %>%
mutate(cohort = "TCGA")
## Rows: 9806 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
PEDAYA_samples <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/PEDAYA_rollup.sample_list.txt",
col_names = "Sample_ID") %>%
mutate(cohort = "PEDAYA")
## Rows: 2814 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pd_cohorts <- read_tsv("../input_data/pan_disease_cohort_members_2023_11_30-16_37_31_2023.tsv") %>%
rename(original_cohort_name = cohort,
focus_sample_ID = TH_id,
Sample_ID = cohort_member) %>%
mutate(
cohort_pd_subset = str_replace(original_cohort_name,
"first_degree_mcs_cohort", "pan_disease_1st_degree") %>%
str_replace("first_and_second_degree_mcs_cohort", "pan_disease_1st_and_2nd_degree") %>%
str_replace("diagnosed_disease_cohort", "pan_disease_same_diagnosis") %>%
str_replace("pandisease_samples", "pan_disease_same_inferred_diagnosis"),
cohort = paste(focus_sample_ID, cohort_pd_subset))
## Rows: 69829 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (3): cohort, TH_id, cohort_member
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# dput(unique(pd_cohorts$cohort))
n_distinct(pd_cohorts$cohort)
## [1] 116
v8_expr <- read_tsv("../input_data/v8_expr_for_ckcc2.tsv.gz") %>%
mutate(ever_outlier_in_ckcc2 = Gene %in% outlier_genes_detected)
## Rows: 3280536 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, Sample_ID
## dbl (1): log2TPM1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
v10_expr <- read_tsv("../input_data/v10_expr_for_ckcc2.tsv.gz") %>%
mutate(ever_outlier_in_ckcc2 = Gene %in% outlier_genes_detected)
## Rows: 234324 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, Sample_ID
## dbl (1): log2TPM1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
expr <- bind_rows(
v10_expr,
v8_expr,
v11_expr) %>%
distinct()
pan_cancer_samples <- expr %>%
select(Sample_ID) %>%
distinct() %>%
mutate(cohort = "Treehouse_pc")
samples_in_cohorts <- bind_rows(
stanford_samples,
TCGA_samples,
PEDAYA_samples,
pan_cancer_samples,
pd_cohorts %>%
select(cohort, Sample_ID))
tabyl(samples_in_cohorts,
cohort)
## cohort n percent
## PEDAYA 2814 2.950830e-02
## TCGA 9806 1.028281e-01
## TH03_TH34 110 1.153487e-03
## TH34_1150_S02 pan_disease_1st_and_2nd_degree 100 1.048625e-03
## TH34_1150_S02 pan_disease_1st_degree 52 5.452849e-04
## TH34_1150_S02 pan_disease_same_diagnosis 77 8.074410e-04
## TH34_1150_S02 pan_disease_same_inferred_diagnosis 77 8.074410e-04
## TH34_1162_S01 pan_disease_1st_and_2nd_degree 1843 1.932615e-02
## TH34_1162_S01 pan_disease_1st_degree 80 8.388998e-04
## TH34_1162_S01 pan_disease_same_diagnosis 72 7.550098e-04
## TH34_1162_S01 pan_disease_same_inferred_diagnosis 72 7.550098e-04
## TH34_1163_S01 pan_disease_1st_and_2nd_degree 423 4.435683e-03
## TH34_1163_S01 pan_disease_1st_degree 48 5.033399e-04
## TH34_1163_S01 pan_disease_same_diagnosis 58 6.082023e-04
## TH34_1163_S01 pan_disease_same_inferred_diagnosis 99 1.038138e-03
## TH34_1179_S01 pan_disease_same_diagnosis 290 3.041012e-03
## TH34_1238_S01 pan_disease_1st_and_2nd_degree 120 1.258350e-03
## TH34_1238_S01 pan_disease_1st_degree 6 6.291748e-05
## TH34_1238_S01 pan_disease_same_diagnosis 265 2.778856e-03
## TH34_1238_S01 pan_disease_same_inferred_diagnosis 265 2.778856e-03
## TH34_1239_S01 pan_disease_1st_and_2nd_degree 582 6.102996e-03
## TH34_1239_S01 pan_disease_1st_degree 96 1.006680e-03
## TH34_1239_S01 pan_disease_same_diagnosis 420 4.404224e-03
## TH34_1239_S01 pan_disease_same_inferred_diagnosis 687 7.204052e-03
## TH34_1240_S01 pan_disease_1st_and_2nd_degree 63 6.606336e-04
## TH34_1240_S01 pan_disease_1st_degree 2 2.097249e-05
## TH34_1240_S01 pan_disease_same_diagnosis 73 7.654961e-04
## TH34_1240_S01 pan_disease_same_inferred_diagnosis 73 7.654961e-04
## TH34_1349_S01 pan_disease_same_diagnosis 730 7.654961e-03
## TH34_1349_S02 pan_disease_same_diagnosis 730 7.654961e-03
## TH34_1350_S01 pan_disease_1st_and_2nd_degree 620 6.501473e-03
## TH34_1350_S01 pan_disease_1st_degree 92 9.647348e-04
## TH34_1350_S01 pan_disease_same_diagnosis 422 4.425196e-03
## TH34_1350_S01 pan_disease_same_inferred_diagnosis 698 7.319401e-03
## TH34_1351_S01 pan_disease_1st_and_2nd_degree 5470 5.735977e-02
## TH34_1351_S01 pan_disease_1st_degree 137 1.436616e-03
## TH34_1351_S01 pan_disease_same_diagnosis 151 1.583423e-03
## TH34_1351_S01 pan_disease_same_inferred_diagnosis 151 1.583423e-03
## TH34_1352_S01 pan_disease_1st_and_2nd_degree 339 3.554838e-03
## TH34_1352_S01 pan_disease_1st_degree 3 3.145874e-05
## TH34_1352_S01 pan_disease_same_inferred_diagnosis 4 4.194499e-05
## TH34_1379_S01 pan_disease_1st_and_2nd_degree 4777 5.009280e-02
## TH34_1379_S01 pan_disease_1st_degree 45 4.718811e-04
## TH34_1379_S01 pan_disease_same_diagnosis 666 6.983841e-03
## TH34_1379_S01 pan_disease_same_inferred_diagnosis 1025 1.074840e-02
## TH34_1380_S01 pan_disease_1st_and_2nd_degree 157 1.646341e-03
## TH34_1380_S01 pan_disease_1st_degree 8 8.388998e-05
## TH34_1380_S01 pan_disease_same_diagnosis 45 4.718811e-04
## TH34_1380_S01 pan_disease_same_inferred_diagnosis 45 4.718811e-04
## TH34_1381_S01 pan_disease_1st_and_2nd_degree 866 9.081090e-03
## TH34_1381_S01 pan_disease_1st_degree 27 2.831287e-04
## TH34_1381_S01 pan_disease_same_diagnosis 27 2.831287e-04
## TH34_1381_S01 pan_disease_same_inferred_diagnosis 838 8.787475e-03
## TH34_1399_S01 pan_disease_1st_and_2nd_degree 1371 1.437665e-02
## TH34_1399_S01 pan_disease_1st_degree 12 1.258350e-04
## TH34_1399_S01 pan_disease_same_diagnosis 316 3.313654e-03
## TH34_1399_S01 pan_disease_same_inferred_diagnosis 92 9.647348e-04
## TH34_1400_S01 pan_disease_1st_and_2nd_degree 3486 3.655506e-02
## TH34_1400_S01 pan_disease_1st_degree 28 2.936149e-04
## TH34_1400_S01 pan_disease_same_diagnosis 50 5.243124e-04
## TH34_1400_S01 pan_disease_same_inferred_diagnosis 302 3.166847e-03
## TH34_1412_S01 pan_disease_1st_and_2nd_degree 5 5.243124e-05
## TH34_1412_S01 pan_disease_1st_degree 4 4.194499e-05
## TH34_1412_S01 pan_disease_same_diagnosis 316 3.313654e-03
## TH34_1412_S01 pan_disease_same_inferred_diagnosis 5 5.243124e-05
## TH34_1414_S01 pan_disease_1st_and_2nd_degree 183 1.918983e-03
## TH34_1414_S01 pan_disease_1st_degree 33 3.460462e-04
## TH34_1414_S01 pan_disease_same_diagnosis 42 4.404224e-04
## TH34_1414_S01 pan_disease_same_inferred_diagnosis 42 4.404224e-04
## TH34_1415_S01 pan_disease_1st_and_2nd_degree 5605 5.877542e-02
## TH34_1415_S01 pan_disease_1st_degree 65 6.816061e-04
## TH34_1415_S01 pan_disease_same_diagnosis 324 3.397544e-03
## TH34_1415_S01 pan_disease_same_inferred_diagnosis 1101 1.154536e-02
## TH34_1444_S01 pan_disease_1st_and_2nd_degree 892 9.353733e-03
## TH34_1444_S01 pan_disease_1st_degree 69 7.235511e-04
## TH34_1444_S01 pan_disease_same_inferred_diagnosis 514 5.389931e-03
## TH34_1445_S02 pan_disease_1st_and_2nd_degree 1342 1.407254e-02
## TH34_1445_S02 pan_disease_1st_degree 558 5.851326e-03
## TH34_1445_S02 pan_disease_same_diagnosis 798 8.368025e-03
## TH34_1445_S02 pan_disease_same_inferred_diagnosis 798 8.368025e-03
## TH34_1446_S01 pan_disease_1st_and_2nd_degree 886 9.290815e-03
## TH34_1446_S01 pan_disease_1st_degree 62 6.501473e-04
## TH34_1446_S01 pan_disease_same_diagnosis 798 8.368025e-03
## TH34_1446_S01 pan_disease_same_inferred_diagnosis 799 8.378512e-03
## TH34_1447_S01 pan_disease_1st_and_2nd_degree 997 1.045479e-02
## TH34_1447_S01 pan_disease_1st_degree 6 6.291748e-05
## TH34_1447_S01 pan_disease_same_diagnosis 364 3.816994e-03
## TH34_1447_S01 pan_disease_same_inferred_diagnosis 823 8.630182e-03
## TH34_1447_S02 pan_disease_same_diagnosis 364 3.816994e-03
## TH34_1452_S01 pan_disease_1st_and_2nd_degree 234 2.453782e-03
## TH34_1452_S01 pan_disease_1st_degree 2 2.097249e-05
## TH34_1452_S01 pan_disease_same_diagnosis 21 2.202112e-04
## TH34_1452_S01 pan_disease_same_inferred_diagnosis 38 3.984774e-04
## TH34_1455_S01 pan_disease_1st_and_2nd_degree 433 4.540545e-03
## TH34_1455_S01 pan_disease_1st_degree 2 2.097249e-05
## TH34_1455_S01 pan_disease_same_diagnosis 21 2.202112e-04
## TH34_1455_S01 pan_disease_same_inferred_diagnosis 602 6.312721e-03
## TH34_1456_S02 pan_disease_1st_and_2nd_degree 2845 2.983337e-02
## TH34_1456_S02 pan_disease_1st_degree 33 3.460462e-04
## TH34_1456_S02 pan_disease_same_diagnosis 156 1.635855e-03
## TH34_1456_S02 pan_disease_same_inferred_diagnosis 1170 1.226891e-02
## TH34_2292_S01 pan_disease_same_diagnosis 415 4.351793e-03
## TH34_2293_S01 pan_disease_1st_and_2nd_degree 4586 4.808993e-02
## TH34_2293_S01 pan_disease_1st_degree 33 3.460462e-04
## TH34_2293_S01 pan_disease_same_diagnosis 415 4.351793e-03
## TH34_2293_S01 pan_disease_same_inferred_diagnosis 325 3.408030e-03
## TH34_2351_S01 pan_disease_1st_and_2nd_degree 1725 1.808878e-02
## TH34_2351_S01 pan_disease_1st_degree 25 2.621562e-04
## TH34_2351_S01 pan_disease_same_diagnosis 50 5.243124e-04
## TH34_2351_S01 pan_disease_same_inferred_diagnosis 107 1.122028e-03
## TH34_2410_S01 pan_disease_1st_and_2nd_degree 3818 4.003649e-02
## TH34_2410_S01 pan_disease_1st_degree 35 3.670187e-04
## TH34_2410_S01 pan_disease_same_diagnosis 439 4.603463e-03
## TH34_2410_S01 pan_disease_same_inferred_diagnosis 337 3.533865e-03
## TH34_2411_S01 pan_disease_1st_and_2nd_degree 3420 3.586297e-02
## TH34_2411_S01 pan_disease_1st_degree 116 1.216405e-03
## TH34_2411_S01 pan_disease_same_diagnosis 188 1.971414e-03
## TH34_2411_S01 pan_disease_same_inferred_diagnosis 327 3.429003e-03
## TH34_2666_S01 pan_disease_same_diagnosis 443 4.645408e-03
## Treehouse_pc 12804 1.342659e-01
rsem_path <- "../input_data/non_compendium_expression"
gene_name_conversion <- read_tsv(file.path(rsem_path,
"EnsGeneID_Hugo_Observed_Conversions.txt"))
## Rows: 60498 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): HugoID, EnsGeneID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
relevant_gene_name_conversion <- gene_name_conversion %>%
filter(HugoID %in% outlier_genes_detected)
rsem_kitchen_sink_data <- tibble(file_name = list.files(
path = rsem_path,
pattern = "_rsem_genes.results")) %>%
rowwise() %>%
mutate(rsem_raw = list(read_tsv(file.path(rsem_path, file_name),
show_col_types = FALSE
))) %>%
unnest(rsem_raw) %>%
filter(gene_id %in% relevant_gene_name_conversion$EnsGeneID) %>%
mutate(Sample_ID = str_extract(file_name, "TH[R]?[0-9]{2}_[0-9]{4}_S[0-9]{2}")) %>%
left_join(relevant_gene_name_conversion,
by=c("gene_id"="EnsGeneID")) %>%
group_by(Sample_ID, HugoID) %>%
summarize(sum_TPM = sum(TPM),
n=n()) %>%
mutate(log2TPM1 = log2(sum_TPM +1))
## `summarise()` has grouped output by 'Sample_ID'. You can override using the
## `.groups` argument.
table(rsem_kitchen_sink_data$n)
##
## 1 2
## 275 5
patient_expression_from_rsem_files <- rsem_kitchen_sink_data %>%
select(gene = HugoID,
log2TPM1,
Sample_ID)
patient_expression_from_compendia <- outliers %>%
select(Sample_ID, gene) %>%
distinct() %>%
left_join(expr,
by=c("Sample_ID", "gene"="Gene")) %>%
na.omit() # excludes samples not in compendium
patient_expression <- bind_rows(
patient_expression_from_rsem_files,
patient_expression_from_compendia)
length(outlier_genes_detected)
## [1] 56
# v8 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/TreehousePEDv8_clinical_metadata.2018-07-25.tsv")
# v9 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/TreehousePEDv9_clinical_metadata.2019-03-15.tsv")
# v10 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/clinical_TumorCompendium_v10_PolyA_2020-01-28.ts
# v")
#
# samples_in_cohorts %>%
# filter(cohort == "TH34_1351_S01 pan_disease_1st_and_2nd_degree")
#
# samples_in_cohorts %>%
# filter(! Sample_ID %in% expr$Sample_ID) %>%
# select(Sample_ID) %>%
# distinct()
# samples_in_cohorts %>%
# filter(! Sample_ID %in% expr$Sample_ID) %>%
# select(Sample_ID) %>%
# distinct() %>%
# pull(Sample_ID) %>%
# cat(sep = " ")
#
# samples_in_cohorts %>%
# filter(! Sample_ID %in% expr$Sample_ID) %>%
# filter(! Sample_ID %in% v8$th_sampleid) %>%
# filter(! Sample_ID %in% v10$th_sampleid) %>%
# select(Sample_ID) %>%
# distinct()
# Conclusion
# all but TH34_1456_S02 are in v8
# TH34_1456_S02 is not in v9
# TH34_1456_S02 is in v10
# are all in v10? no. so add data from v8 and v10
servers were going very slowly, so i got v8 and v10 on different servers (razzmatazz and crimson)
setwd("/scratch/hbeale/")
Sys.setenv("VROOM_CONNECTION_SIZE" = 131072 * 2)
library(tidyverse)
v8 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/TreehousePEDv8_unique_hugo_log2_tpm_plus_1.2018-07-25.tsv")
v10 <- read_tsv("https://xena.treehouse.gi.ucsc.edu/download/TumorCompendium_v10_PolyA_hugo_log2tpm_58581genes_2019-07-25.tsv")
samples_with_expr_to_retrieve <- "TH03_0296_S05 THR14_0307_S01 THR14_1181_S01 THR14_1182_S01 THR14_1185_S01 THR14_1196_S01 THR14_1197_S01 THR14_1203_S01 THR14_1213_S01 THR14_1220_S01 THR14_1229_S01 THR14_1230_S01 THR14_1231_S01 THR14_1232_S01 THR14_1233_S01 THR14_1234_S01 THR14_1235_S01 THR14_1236_S01 THR33_1131_S01 THR33_1139_S01 THR14_1180_S01 THR14_1183_S01 THR14_1184_S01 THR14_1186_S01 THR14_1188_S01 THR14_1189_S01 THR14_1190_S01 THR14_1191_S01 THR14_1192_S01 THR14_1194_S01 THR14_1195_S01 THR14_1198_S01 THR14_1199_S01 THR14_1200_S01 THR14_1201_S01 THR14_1202_S01 THR14_1204_S01 THR14_1205_S01 THR14_1206_S01 THR14_1207_S01 THR14_1209_S01 THR14_1211_S01 THR14_1212_S01 THR14_1214_S01 THR14_1215_S01 THR14_1216_S01 THR14_1217_S01 THR14_1218_S01 THR14_1219_S01 THR14_1221_S01 THR14_1222_S01 THR14_1223_S01 THR14_1237_S01 THR35_1254_S01 THR14_1208_S01 THR14_1210_S01 TH34_1456_S02" %>%
str_split(" ") %>% unlist()
v8_relevant <- v8 %>% select(Gene, any_of(samples_with_expr_to_retrieve))
v8_relevant_longer <- pivot_longer(v8_relevant,
-Gene,
names_to = "Sample_ID",
values_to = "log2TPM1")
write_tsv(v8_relevant_longer, "v8_expr_for_ckcc2.tsv.gz")
v10_relevant <- v10 %>% select(Gene, any_of(samples_with_expr_to_retrieve))
v10_relevant_longer <- pivot_longer(v10_relevant,
-Gene,
names_to = "Sample_ID",
values_to = "log2TPM1")
write_tsv(v10_relevant_longer, "v10_expr_for_ckcc2.tsv.gz")
cohort_thresholds_raw <- left_join(samples_in_cohorts,
expr,
by=c("Sample_ID")) %>%
na.omit() %>% # while I'm missing expression data for some samples
group_by(Gene, cohort) %>%
summarize(q25 = quantile(log2TPM1, 0.25),
median = median(log2TPM1),
q75 = quantile(log2TPM1, 0.75),
IQR = q75-q25,
up_outlier_threshold = q75 + (1.5*IQR))
## `summarise()` has grouped output by 'Gene'. You can override using the
## `.groups` argument.
cohort_thresholds <- cohort_thresholds_raw %>%
pivot_longer(c(-Gene, -cohort),
names_to = "stat") %>%
pivot_wider(names_from = cohort, values_from = value) %>%
mutate(frac_change_in_ped_relative_to_TCGA =
(PEDAYA - TCGA) / TCGA,
frac_change_in_treehouse_relative_to_TCGA =
(Treehouse_pc - TCGA) / Treehouse_pc)
this_sample_id <- "TH34_1455_S01"
this_gene <- "FGFR3"
this_data <- cohort_thresholds %>%
pivot_longer(c(-Gene, -stat),
names_to = "cohort") %>%
filter(Gene == this_gene,
str_detect(cohort, this_sample_id) |
cohort %in% c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc"))
ggplot(this_data) +
geom_point(aes(x=value, y = stat, color = cohort))
expr_relevant_to_this_outlier <- left_join(samples_in_cohorts %>%
filter(str_detect(cohort, this_sample_id) |
cohort %in% c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc")),
expr %>%
filter(Gene == this_gene),
by=c("Sample_ID")) %>%
na.omit() %>% # while I'm missing expression data for some samples
mutate(cohort = as.factor(cohort) %>%
fct_relevel(c("Treehouse_pc", "TCGA", "PEDAYA", "TH03_TH34",
paste(this_sample_id, c("pan_disease_same_diagnosis",
"pan_disease_same_inferred_diagnosis",
"pan_disease_1st_degree",
"pan_disease_1st_and_2nd_degree"))))) %>%
group_by(cohort) %>%
mutate(cohort_with_n = paste0(cohort, " (n=", n(), ")")) %>%
ungroup %>%
arrange(cohort) %>%
mutate(cohort_with_n = factor(cohort_with_n,
levels = unique(cohort_with_n)))
dput(unique(expr_relevant_to_this_outlier$cohort))
## structure(1:8, levels = c("Treehouse_pc", "TCGA", "PEDAYA", "TH03_TH34",
## "TH34_1455_S01 pan_disease_same_diagnosis", "TH34_1455_S01 pan_disease_same_inferred_diagnosis",
## "TH34_1455_S01 pan_disease_1st_degree", "TH34_1455_S01 pan_disease_1st_and_2nd_degree"
## ), class = "factor")
ggplot(expr_relevant_to_this_outlier) +
geom_boxplot(aes(y=cohort_with_n, x=log2TPM1),
#fill = cohort_with_n),
outlier.shape = NA) +
geom_vline(data = expr %>%
filter(Gene == this_gene,
Sample_ID == this_sample_id),
aes(xintercept = log2TPM1),
color = "red") +
facet_col(~cohort_with_n, scales = "free_y") +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
legend.position="none") +
ggtitle(paste(this_gene, "cohorts for", this_sample_id))
# Review all sample-outlier pairs
outliers_to_plot <- outliers %>%
group_by(gene, Sample_ID) %>%
summarize(outlier_relative_to = paste(comparison_cohort, collapse = ", "))
## `summarise()` has grouped output by 'gene'. You can override using the
## `.groups` argument.
plot_distributions <- function(this_gene, this_sample_id, outlier_relative_to) {
#this_sample_id <- "TH34_1455_S01"
# this_gene <- "FGFR3"
this_data <- cohort_thresholds %>%
pivot_longer(c(-Gene, -stat),
names_to = "cohort") %>%
filter(Gene == this_gene,
str_detect(cohort, this_sample_id) |
cohort %in% c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc"))
expected_levels <- c("Treehouse_pc", "TCGA", "PEDAYA", "TH03_TH34",
paste(this_sample_id, c("pan_disease_same_diagnosis",
"pan_disease_same_inferred_diagnosis",
"pan_disease_1st_degree",
"pan_disease_1st_and_2nd_degree")))
expr_relevant_to_this_outlier <- left_join(samples_in_cohorts %>%
filter(str_detect(cohort, this_sample_id) |
cohort %in% c("PEDAYA", "TCGA", "TH03_TH34", "Treehouse_pc")),
expr %>%
filter(Gene == this_gene),
by=c("Sample_ID")) %>%
na.omit() %>% # while I'm missing expression data for some samples
mutate(cohort = as.factor(cohort) %>%
fct_relevel(expected_levels[expected_levels %in% cohort])) %>%
group_by(cohort) %>%
mutate(cohort_with_n = paste0(cohort, " (n=", n(), ")")) %>%
ungroup %>%
arrange(cohort) %>%
mutate(cohort_with_n = factor(cohort_with_n,
levels = unique(cohort_with_n)))
p <- ggplot(expr_relevant_to_this_outlier) +
geom_boxplot(aes(y=cohort_with_n, x=log2TPM1),
#fill = cohort_with_n),
outlier.shape = NA) +
geom_vline(data = expr %>%
filter(Gene == this_gene,
Sample_ID == this_sample_id),
aes(xintercept = log2TPM1),
color = "red") +
facet_col(~cohort_with_n, scales = "free_y") +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
legend.position="none") +
ggtitle(paste(this_gene, "cohorts for", this_sample_id, ";", outlier_relative_to))
return(p)
}
pmap(list(outliers_to_plot$gene, outliers_to_plot$Sample_ID, outliers_to_plot$outlier_relative_to), plot_distributions)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]
##
## [[57]]
##
## [[58]]
##
## [[59]]
##
## [[60]]
##
## [[61]]
##
## [[62]]
##
## [[63]]
##
## [[64]]
##
## [[65]]
##
## [[66]]
##
## [[67]]
##
## [[68]]
##
## [[69]]
##
## [[70]]
##
## [[71]]
##
## [[72]]
##
## [[73]]
##
## [[74]]
##
## [[75]]
##
## [[76]]
##
## [[77]]
##
## [[78]]
##
## [[79]]
##
## [[80]]
##
## [[81]]
##
## [[82]]
##
## [[83]]
##
## [[84]]
##
## [[85]]
##
## [[86]]
##
## [[87]]
##
## [[88]]
##
## [[89]]
##
## [[90]]
##
## [[91]]
##
## [[92]]
##
## [[93]]
##
## [[94]]
##
## [[95]]
##
## [[96]]
##
## [[97]]
##
## [[98]]
##
## [[99]]
##
## [[100]]
##
## [[101]]
##
## [[102]]
##
## [[103]]
##
## [[104]]
##
## [[105]]
##
## [[106]]
##
## [[107]]
##
## [[108]]
##
## [[109]]
##
## [[110]]
##
## [[111]]
##
## [[112]]
##
## [[113]]
##
## [[114]]
##
## [[115]]
##
## [[116]]
##
## [[117]]
##
## [[118]]
##
## [[119]]
##
## [[120]]
##
## [[121]]
##
## [[122]]
##
## [[123]]
##
## [[124]]
##
## [[125]]
##
## [[126]]
##
## [[127]]
##
## [[128]]
##
## [[129]]
##
## [[130]]